import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt
%matplotlib inline
import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
# Load the movie dataset from the CSV file into a DataFrame.
df = pd.read_csv('tmbd_file.csv')
# df.shape is (rows, columns); report the column count.
_, column_count = df.shape
print("Number of columns", column_count)
Number of columns 21
# Report how many rows the DataFrame holds.
row_count = len(df)
print("Number of rows", row_count)
Number of rows 10866
# Boolean Series with one entry per row: True where the row repeats an earlier row in full.
df.duplicated()
0 False
1 False
2 False
3 False
4 False
...
10861 False
10862 False
10863 False
10864 False
10865 False
Length: 10866, dtype: bool
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
| ID | POPULARITY | BUDGET | REVENUE | RUNTIME | VOTE_COUNT | VOTE_AVERAGE | RELEASE_YEAR | BUDGET_ADJ | REVENUE_ADJ | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 10866.000000 | 10866.000000 | 1.086600e+04 | 1.086600e+04 | 10866.000000 | 10866.000000 | 10866.000000 | 10866.000000 | 1.086600e+04 | 1.086600e+04 |
| mean | 66064.177434 | 0.646441 | 1.462570e+07 | 3.982332e+07 | 102.070863 | 217.389748 | 5.974922 | 2001.322658 | 1.755104e+07 | 5.136436e+07 |
| std | 92130.136561 | 1.000185 | 3.091321e+07 | 1.170035e+08 | 31.381405 | 575.619058 | 0.935142 | 12.812941 | 3.430616e+07 | 1.446325e+08 |
| min | 5.000000 | 0.000065 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 10.000000 | 1.500000 | 1960.000000 | 0.000000e+00 | 0.000000e+00 |
| 25% | 10596.250000 | 0.207583 | 0.000000e+00 | 0.000000e+00 | 90.000000 | 17.000000 | 5.400000 | 1995.000000 | 0.000000e+00 | 0.000000e+00 |
| 50% | 20669.000000 | 0.383856 | 0.000000e+00 | 0.000000e+00 | 99.000000 | 38.000000 | 6.000000 | 2006.000000 | 0.000000e+00 | 0.000000e+00 |
| 75% | 75610.000000 | 0.713817 | 1.500000e+07 | 2.400000e+07 | 111.000000 | 145.750000 | 6.600000 | 2011.000000 | 2.085325e+07 | 3.369710e+07 |
| max | 417859.000000 | 32.985763 | 4.250000e+08 | 2.781506e+09 | 900.000000 | 9767.000000 | 9.200000 | 2015.000000 | 4.250000e+08 | 2.827124e+09 |
# Remove columns that the analysis below does not use.
df.drop(columns=['ID', 'TAGLINE', 'HOMEPAGE', 'BUDGET_ADJ', 'REVENUE_ADJ', 'IMDB ID'], inplace=True)

# REALEASE_DATE holds m/d/yy strings. Letting the parser guess the century of
# a two-digit year pushes older films into the future (12/14/71 -> 2071), so
# take month/day from the string and the four-digit year from RELEASE_YEAR.
# Assumes RELEASE_YEAR is the year of REALEASE_DATE, as in the rows displayed
# by df.head() — TODO confirm for the whole file.
month_day = df["REALEASE_DATE"].str.rsplit("/", n=1).str[0]
df["Date"] = pd.to_datetime(
    month_day + "/" + df["RELEASE_YEAR"].astype(str), format="%m/%d/%Y"
)
df["Date"]

# Calendar parts derived from the parsed date.
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month
# dayofweek counts from Monday=0 to Sunday=6.
df["Day_of_week"] = df["Date"].dt.dayofweek
df.head()
| POPULARITY | BUDGET | REVENUE | ORIGINAL TITLE | CAST | DIRECTOR | KEYWORDS | OVERVIEW | RUNTIME | GENRES | PRODUCTION COMPANIES | REALEASE_DATE | VOTE_COUNT | VOTE_AVERAGE | RELEASE_YEAR | Date | Year | Month | Day_of_week | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 32.985763 | 150000000 | 1513528810 | Jurassic World | Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... | Colin Trevorrow | monster|dna|tyrannosaurus rex|velociraptor|island | Twenty-two years after the events of Jurassic ... | 124 | Action|Adventure|Science Fiction|Thriller | Universal Studios|Amblin Entertainment|Legenda... | 6/9/15 | 5562 | 6.5 | 2015 | 2015-06-09 | 2015 | 6 | 1 |
| 1 | 28.419936 | 150000000 | 378436354 | Mad Max: Fury Road | Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic... | George Miller | future|chase|post-apocalyptic|dystopia|australia | An apocalyptic story set in the furthest reach... | 120 | Action|Adventure|Science Fiction|Thriller | Village Roadshow Pictures|Kennedy Miller Produ... | 5/13/15 | 6185 | 7.1 | 2015 | 2015-05-13 | 2015 | 5 | 2 |
| 2 | 13.112507 | 110000000 | 295238201 | Insurgent | Shailene Woodley|Theo James|Kate Winslet|Ansel... | Robert Schwentke | based on novel|revolution|dystopia|sequel|dyst... | Beatrice Prior must confront her inner demons ... | 119 | Adventure|Science Fiction|Thriller | Summit Entertainment|Mandeville Films|Red Wago... | 3/18/15 | 2480 | 6.3 | 2015 | 2015-03-18 | 2015 | 3 | 2 |
| 3 | 11.173104 | 200000000 | 2068178225 | Star Wars: The Force Awakens | Harrison Ford|Mark Hamill|Carrie Fisher|Adam D... | J.J. Abrams | android|spaceship|jedi|space opera|3d | Thirty years after defeating the Galactic Empi... | 136 | Action|Adventure|Science Fiction|Fantasy | Lucasfilm|Truenorth Productions|Bad Robot | 12/15/15 | 5292 | 7.5 | 2015 | 2015-12-15 | 2015 | 12 | 1 |
| 4 | 9.335014 | 190000000 | 1506249360 | Furious 7 | Vin Diesel|Paul Walker|Jason Statham|Michelle ... | James Wan | car race|speed|revenge|suspense|car | Deckard Shaw seeks revenge against Dominic Tor... | 137 | Action|Crime|Thriller | Universal Pictures|Original Film|Media Rights ... | 4/1/15 | 2947 | 7.3 | 2015 | 2015-04-01 | 2015 | 4 | 2 |
# List the column labels after the drop and the added date columns.
df.columns
Index(['POPULARITY', 'BUDGET', 'REVENUE', 'ORIGINAL TITLE', 'CAST', 'DIRECTOR',
'KEYWORDS', 'OVERVIEW', 'RUNTIME', 'GENRES', 'PRODUCTION COMPANIES',
'REALEASE_DATE', 'VOTE_COUNT', 'VOTE_AVERAGE', 'RELEASE_YEAR', 'Date',
'Year', 'Month', 'Day_of_week'],
dtype='object')
# Total number of missing values across the whole DataFrame:
# count them per column first, then add the per-column counts together.
missing_per_column = df.isna().sum()
missing_per_column.sum()
2670
# Mean vote count for each distinct release-date string, highest first.
mean_votes_by_date = df.groupby('REALEASE_DATE')['VOTE_COUNT'].mean()
mean_votes_by_date.sort_values(ascending=False)
REALEASE_DATE
7/14/10 9767.0
4/25/12 8903.0
11/26/12 6417.0
5/13/15 6185.0
10/14/99 5923.0
...
2/28/15 10.0
10/18/97 10.0
11/22/85 10.0
10/18/15 10.0
11/22/98 10.0
Name: VOTE_COUNT, Length: 5909, dtype: float64
# Show the column labels again before building the rankings below.
df.columns
Index(['POPULARITY', 'BUDGET', 'REVENUE', 'ORIGINAL TITLE', 'CAST', 'DIRECTOR',
'KEYWORDS', 'OVERVIEW', 'RUNTIME', 'GENRES', 'PRODUCTION COMPANIES',
'REALEASE_DATE', 'VOTE_COUNT', 'VOTE_AVERAGE', 'RELEASE_YEAR', 'Date',
'Year', 'Month', 'Day_of_week'],
dtype='object')
# Ten movies with the highest POPULARITY value.
by_popularity_desc = df.sort_values('POPULARITY', ascending=False)
top_10 = by_popularity_desc.head(10)
top_10
| POPULARITY | BUDGET | REVENUE | ORIGINAL TITLE | CAST | DIRECTOR | KEYWORDS | OVERVIEW | RUNTIME | GENRES | PRODUCTION COMPANIES | REALEASE_DATE | VOTE_COUNT | VOTE_AVERAGE | RELEASE_YEAR | Date | Year | Month | Day_of_week | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 32.985763 | 150000000 | 1513528810 | Jurassic World | Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... | Colin Trevorrow | monster|dna|tyrannosaurus rex|velociraptor|island | Twenty-two years after the events of Jurassic ... | 124 | Action|Adventure|Science Fiction|Thriller | Universal Studios|Amblin Entertainment|Legenda... | 6/9/15 | 5562 | 6.5 | 2015 | 2015-06-09 | 2015 | 6 | 1 |
| 1 | 28.419936 | 150000000 | 378436354 | Mad Max: Fury Road | Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic... | George Miller | future|chase|post-apocalyptic|dystopia|australia | An apocalyptic story set in the furthest reach... | 120 | Action|Adventure|Science Fiction|Thriller | Village Roadshow Pictures|Kennedy Miller Produ... | 5/13/15 | 6185 | 7.1 | 2015 | 2015-05-13 | 2015 | 5 | 2 |
| 629 | 24.949134 | 165000000 | 621752480 | Interstellar | Matthew McConaughey|Jessica Chastain|Anne Hath... | Christopher Nolan | saving the world|artificial intelligence|fathe... | Interstellar chronicles the adventures of a gr... | 169 | Adventure|Drama|Science Fiction | Paramount Pictures|Legendary Pictures|Warner B... | 11/5/14 | 6498 | 8.0 | 2014 | 2014-11-05 | 2014 | 11 | 2 |
| 630 | 14.311205 | 170000000 | 773312399 | Guardians of the Galaxy | Chris Pratt|Zoe Saldana|Dave Bautista|Vin Dies... | James Gunn | marvel comic|spaceship|space|scene during end ... | Light years from Earth, 26 years after being a... | 121 | Action|Science Fiction|Adventure | Marvel Studios|Moving Picture Company (MPC)|Bu... | 7/30/14 | 5612 | 7.9 | 2014 | 2014-07-30 | 2014 | 7 | 2 |
| 2 | 13.112507 | 110000000 | 295238201 | Insurgent | Shailene Woodley|Theo James|Kate Winslet|Ansel... | Robert Schwentke | based on novel|revolution|dystopia|sequel|dyst... | Beatrice Prior must confront her inner demons ... | 119 | Adventure|Science Fiction|Thriller | Summit Entertainment|Mandeville Films|Red Wago... | 3/18/15 | 2480 | 6.3 | 2015 | 2015-03-18 | 2015 | 3 | 2 |
| 631 | 12.971027 | 170000000 | 714766572 | Captain America: The Winter Soldier | Chris Evans|Scarlett Johansson|Sebastian Stan|... | Joe Russo|Anthony Russo | washington d.c.|future|shield|marvel comic|comic | After the cataclysmic events in New York with ... | 136 | Action|Adventure|Science Fiction | Marvel Studios | 3/20/14 | 3848 | 7.6 | 2014 | 2014-03-20 | 2014 | 3 | 3 |
| 1329 | 12.037933 | 11000000 | 775398007 | Star Wars | Mark Hamill|Harrison Ford|Carrie Fisher|Peter ... | George Lucas | android|galaxy|hermit|death star|lightsaber | Princess Leia is captured and held hostage by ... | 121 | Adventure|Action|Science Fiction | Lucasfilm|Twentieth Century Fox Film Corporation | 3/20/77 | 4428 | 7.9 | 1977 | 1977-03-20 | 1977 | 3 | 6 |
| 632 | 11.422751 | 20000000 | 78739897 | John Wick | Keanu Reeves|Michael Nyqvist|Alfie Allen|Wille... | Chad Stahelski|David Leitch | hitman|revenge|murder|gangster|dog | After the sudden death of his beloved wife, Jo... | 101 | Action|Thriller | Thunder Road Pictures|Warner Bros.|87Eleven|De... | 10/22/14 | 2712 | 7.0 | 2014 | 2014-10-22 | 2014 | 10 | 2 |
| 3 | 11.173104 | 200000000 | 2068178225 | Star Wars: The Force Awakens | Harrison Ford|Mark Hamill|Carrie Fisher|Adam D... | J.J. Abrams | android|spaceship|jedi|space opera|3d | Thirty years after defeating the Galactic Empi... | 136 | Action|Adventure|Science Fiction|Fantasy | Lucasfilm|Truenorth Productions|Bad Robot | 12/15/15 | 5292 | 7.5 | 2015 | 2015-12-15 | 2015 | 12 | 1 |
| 633 | 10.739009 | 125000000 | 752100229 | The Hunger Games: Mockingjay - Part 1 | Jennifer Lawrence|Josh Hutcherson|Liam Hemswor... | Francis Lawrence | resistance|post-apocalyptic|dystopia|war|sequel | Katniss Everdeen reluctantly becomes the symbo... | 123 | Science Fiction|Adventure|Thriller | Lionsgate|Color Force | 11/18/14 | 3590 | 6.6 | 2014 | 2014-11-18 | 2014 | 11 | 1 |
# Ten movies with the most votes, keeping only the columns the chart uses.
vote_columns = ["VOTE_COUNT", "ORIGINAL TITLE", "GENRES", "Year"]
top_10 = df[vote_columns].sort_values(by=["VOTE_COUNT"], ascending=False).head(10)
top_10

# One point per movie: vote count on x, title on y, colour keyed by the genre string.
fig = px.scatter(
    top_10,
    x='VOTE_COUNT',
    y='ORIGINAL TITLE',
    color='GENRES',
    hover_data=top_10[['GENRES', 'Year']],
    title="Top 10 Most Popular Movies",
)
fig.show()
# Ten movies with the lowest POPULARITY value.
by_popularity_asc = df.sort_values('POPULARITY', ascending=True)
top_10_least = by_popularity_asc.head(10)
top_10_least
| POPULARITY | BUDGET | REVENUE | ORIGINAL TITLE | CAST | DIRECTOR | KEYWORDS | OVERVIEW | RUNTIME | GENRES | PRODUCTION COMPANIES | REALEASE_DATE | VOTE_COUNT | VOTE_AVERAGE | RELEASE_YEAR | Date | Year | Month | Day_of_week | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6181 | 0.000065 | 0 | 0 | North and South, Book I | Patrick Swayze|Philip Casnoff|Kirstie Alley|Ge... | NaN | NaN | Two friends, one northern and one southern, st... | 561 | Drama|History|Western | NaN | 11/3/85 | 17 | 6.0 | 1985 | 1985-11-03 | 1985 | 11 | 6 |
| 9977 | 0.000188 | 0 | 0 | The Hospital | George C. Scott|Diana Rigg|Richard Dysart|Barn... | Arthur Hiller | hospital|malpratice | Black comedy in which a suicidal doctor strugg... | 103 | Mystery|Comedy|Drama | Simcha Productions | 12/14/71 | 10 | 6.4 | 1971 | 2071-12-14 | 2071 | 12 | 0 |
| 6080 | 0.000620 | 0 | 0 | G.B.F. | Michael J. Willett|Paul Iacono|Sasha Pieterse|... | Darren Stein | gay|gay kiss|coming out|high school|friends | The bitter fight for supremacy between the thr... | 92 | Comedy | School Pictures|Parting Shots Media|Logolite E... | 10/20/13 | 82 | 6.1 | 2013 | 2013-10-20 | 2013 | 10 | 6 |
| 6551 | 0.000973 | 0 | 0 | Mon petit doigt m'a dit... | Catherine Frot|André Dussollier|Geneviève Bu... | Pascal Thomas | NaN | No overview found. | 105 | Comedy|Mystery | Rhône-Alpes Cinéma|France2 Cinéma | 4/13/05 | 13 | 5.7 | 2005 | 2005-04-13 | 2005 | 4 | 2 |
| 6961 | 0.001115 | 0 | 0 | Khosla Ka Ghosla! | Anupam Kher|Boman Irani|Parvin Dabas|Tara Shar... | Dibakar Banerjee | bollywood | Upon retirement, cranky control freak Kamal Ki... | 135 | Comedy | UTV Motion Pictures | 9/22/06 | 10 | 6.8 | 2006 | 2006-09-22 | 2006 | 9 | 4 |
| 7268 | 0.001117 | 350000 | 3515061 | Born into Brothels | Zana Briski|Avijit|Geeta Masi|Kochi|Mamuni | Zana Briski|Ross Kauffman | prostitute|red-light disctrict|human trafficki... | The academy award winning documentary depictin... | 85 | Documentary | NaN | 12/8/04 | 23 | 6.4 | 2004 | 2004-12-08 | 2004 | 12 | 2 |
| 7256 | 0.001315 | 0 | 0 | Soupçons | Michael Peterson | Jean-Xavier de Lestrade | NaN | Academy Award-winning documentary filmmaker, J... | 360 | Drama|Documentary | NaN | 6/1/04 | 12 | 7.5 | 2004 | 2004-06-01 | 2004 | 6 | 1 |
| 3370 | 0.001317 | 0 | 0 | Fuera de carta | Javier Cámara|Lola Dueñas|Fernando Tejero|Be... | Nacho G. Velilla | NaN | Maxi thinks is life is perfect. He is a famous... | 111 | Comedy | TLA Releasing | 4/11/08 | 13 | 5.3 | 2008 | 2008-04-11 | 2008 | 4 | 4 |
| 2874 | 0.001349 | 165000 | 0 | Slashers | Tony Curtis Blondell | Maurice Devereaux | NaN | Japan's number one extreme reality show is hav... | 99 | Comedy|Horror|Fantasy | NaN | 1/1/01 | 11 | 5.5 | 2001 | 2001-01-01 | 2001 | 1 | 0 |
| 4948 | 0.001372 | 0 | 0 | The Invisible War | Kori Cioca|Jessica Hinves|Ariana Klay|Elle Hel... | Kirby Dick | post traumatic stress disorder|rape|sexism|ra... | An investigative and powerfully emotional docu... | 93 | Crime|Drama|History|Documentary | Canal+|Chain Camera Pictures|ITVS|Rise Films|G... | 1/20/12 | 17 | 7.0 | 2012 | 2012-01-20 | 2012 | 1 | 4 |
# Ten movies with the fewest votes (this rebinds top_10, which held the
# most-voted movies). Several movies share the minimum vote count, so the
# order among tied rows carries no meaning.
top_10 = df[["VOTE_COUNT", "ORIGINAL TITLE", "GENRES", "REALEASE_DATE"]].sort_values(["VOTE_COUNT"], ascending=True)[:10]
top_10

# Same layout as the most-voted chart, but titled for the low end of the
# ranking so the two figures cannot be confused.
fig = px.scatter(top_10, y='ORIGINAL TITLE', x='VOTE_COUNT', hover_data=top_10[['GENRES', 'REALEASE_DATE']], color='GENRES',
                 title="Top 10 Least Popular Movies")
fig.show()
# Directors with the highest revenue: rank every movie by REVENUE (largest
# first), then show title, director, revenue and vote average for the top five.
top_directors_revenue = df.sort_values(by='REVENUE', ascending=False)
summary_columns = ['ORIGINAL TITLE', 'DIRECTOR', 'REVENUE', 'VOTE_AVERAGE']
top_directors_revenue.head(5)[summary_columns]
| ORIGINAL TITLE | DIRECTOR | REVENUE | VOTE_AVERAGE | |
|---|---|---|---|---|
| 1386 | Avatar | James Cameron | 2781505847 | 7.1 |
| 3 | Star Wars: The Force Awakens | J.J. Abrams | 2068178225 | 7.5 |
| 5231 | Titanic | James Cameron | 1845034188 | 7.3 |
| 4361 | The Avengers | Joss Whedon | 1519557910 | 7.3 |
| 0 | Jurassic World | Colin Trevorrow | 1513528810 | 6.5 |
# Five movies entered by hand as parallel lists: title, director, revenue, vote average.
name = [
    'Avatar',
    'Star Wars: The Force Awakens',
    'Titanic',
    'The Avengers',
    'Jurassic World',
]
direc = [
    'James Cameron',
    'J.J. Abrams',
    'James Cameron',
    'Joss Whedon',
    'Colin Trevorrow',
]
rev = [2781505847, 2068178225, 1845034188, 1519557910, 1513528810]
vote = [7.1, 7.5, 7.3, 7.3, 6.5]

# NOTE: this rebinds df to the five-row table built from the lists above.
df = pd.DataFrame(
    {
        'ORIGINAL TITLE': name,
        'DIRECTOR': direc,
        'REVENUE': rev,
        'VOTE_AVERAGE': vote,
    }
)
df
| ORIGINAL TITLE | DIRECTOR | REVENUE | VOTE_AVERAGE | |
|---|---|---|---|---|
| 0 | Avatar | James Cameron | 2781505847 | 7.1 |
| 1 | Star Wars: The Force Awakens | J.J. Abrams | 2068178225 | 7.5 |
| 2 | Titanic | James Cameron | 1845034188 | 7.3 |
| 3 | The Avengers | Joss Whedon | 1519557910 | 7.3 |
| 4 | Jurassic World | Colin Trevorrow | 1513528810 | 6.5 |
# Horizontal bar chart of the hand-entered revenues, one bar per movie.
# ypos gives each movie a slot on the y axis (0..len(rev)-1).
ypos = np.arange(len(rev))
# Bar length must be the revenue; passing the title strings as the bar values
# would plot their category positions and never show revenue at all.
plt.barh(ypos, rev, color=(0.5, 0.0, 0.5, 1.0))
# Label each slot with its movie title.
plt.yticks(ypos, name)
# Put the first list entry at the top instead of the bottom.
plt.gca().invert_yaxis()
plt.title('Top 5 Movie Director ')
plt.xlabel('Revenue', size='x-small')
plt.ylabel('Movie Name', size='x-large');
# Runtime table entered by hand as parallel lists: title, runtime, vote average.
# NOTE(review): these five titles are identical to the ones used for the
# budget table further down, while the least-popular listing shows a
# 561-minute runtime under "North and South, Book I" — confirm that the
# titles and vote averages really belong to these runtimes.
lst = [
    'The Warriors Way',
    'Pirates of the Caribbean: On Stranger Tides',
    'Pirates of the Caribbean: At Worlds End',
    'Avengers: Age of Ultron',
    'Superman Returns',
]
lst2 = [900, 877, 705, 566, 561]
lst3 = [9.2, 6.8, 8.0, 8.3, 6.0]

# NOTE: this rebinds df to the five-row table built from the lists above.
df = pd.DataFrame(
    {
        'ORIGINAL TITLE': lst,
        'RUNTIME': lst2,
        'VOTE_AVERAGE': lst3,
    }
)
df
| ORIGINAL TITLE | RUNTIME | VOTE_AVERAGE | |
|---|---|---|---|
| 0 | The Warriors Way | 900 | 9.2 |
| 1 | Pirates of the Caribbean: On Stranger Tides | 877 | 6.8 |
| 2 | Pirates of the Caribbean: At Worlds End | 705 | 8.0 |
| 3 | Avengers: Age of Ultron | 566 | 8.3 |
| 4 | Superman Returns | 561 | 6.0 |
# Horizontal bars, one per row of df: length from RUNTIME, label from
# ORIGINAL TITLE. Without an explicit y the bars are keyed by row index only,
# so the chart would not say which movie each bar stands for.
fig = px.bar(
    data_frame=df,
    x="RUNTIME",
    y="ORIGINAL TITLE",
    orientation="h",
    title="Runtime of Movies",
)
fig.show()
# Horizontal bar chart of the hand-entered runtimes (lst2), labelled with the
# titles in lst. The positions are computed here so the chart does not depend
# on a variable left over from an earlier plot.
positions = np.arange(len(lst2))
# Bar length must be the runtime; passing the title strings as the bar values
# would plot their category positions and never show runtime at all.
plt.barh(positions, lst2, color=(0.5, 0.0, 0.0, 1.0))
plt.yticks(positions, lst)
# Put the first list entry at the top instead of the bottom.
plt.gca().invert_yaxis()
plt.title('Top 5 Movie Runtime ')
plt.xlabel('Runtime')
plt.ylabel('Movie Name');
# Budget table entered by hand as two parallel lists: title and budget.
title = [
    'The Warriors Way',
    'Pirates of the Caribbean: On Stranger Tides',
    'Pirates of the Caribbean: At Worlds End',
    'Avengers: Age of Ultron',
    'Superman Returns',
]
budget = [425000000, 380000000, 300000000, 280000000, 270000000]

# NOTE: this rebinds df to the five-row table built from the lists above.
df = pd.DataFrame({'ORIGINAL TITLE': title, 'BUDGET': budget})
df
| ORIGINAL TITLE | BUDGET | |
|---|---|---|
| 0 | The Warriors Way | 425000000 |
| 1 | Pirates of the Caribbean: On Stranger Tides | 380000000 |
| 2 | Pirates of the Caribbean: At Worlds End | 300000000 |
| 3 | Avengers: Age of Ultron | 280000000 |
| 4 | Superman Returns | 270000000 |
# Plotly histogram built from df's BUDGET (x) and ORIGINAL TITLE (y) columns,
# drawn slightly transparent.
fig = px.histogram(
    data_frame=df,
    x="BUDGET",
    y="ORIGINAL TITLE",
    opacity=0.8,
    title="Movie Budget",
)
fig.show()